{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Preprocessing Inputs\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-22T01:21:09.395216Z",
     "start_time": "2018-11-22T01:21:08.056985Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "import gc\n",
    "import glob\n",
    "import itertools\n",
    "import os\n",
    "import pickle\n",
    "\n",
    "import joblib\n",
    "import keras\n",
    "import numpy as np\n",
    "from scipy import sparse\n",
    "from tqdm import tqdm_notebook\n",
    "\n",
    "import utils"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-22T01:23:08.981084Z",
     "start_time": "2018-11-22T01:21:09.396921Z"
    }
   },
   "outputs": [],
   "source": [
    "train = joblib.load(\"train.pkl\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-22T01:23:09.035588Z",
     "start_time": "2018-11-22T01:23:08.983021Z"
    }
   },
   "outputs": [],
   "source": [
    "encoder = utils.ClaimEncoder()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-22T01:23:12.322052Z",
     "start_time": "2018-11-22T01:23:09.038017Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Num Distinct Claims 109810\n",
      "Num Data Points 125050\n"
     ]
    }
   ],
   "source": [
    "claims, labels, article_list, claim_set, claim_to_article = utils.extract_fever_jsonl_data(\"../train.jsonl\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-18T22:56:17.758716Z",
     "start_time": "2018-11-18T22:56:17.753244Z"
    }
   },
   "outputs": [],
   "source": [
    "l = [sparse.csr_matrix((10,5)), sparse.csr_matrix((10,15))]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-18T22:53:21.359020Z",
     "start_time": "2018-11-18T22:53:21.353874Z"
    }
   },
   "outputs": [],
   "source": [
    "nm = sparse.csr_matrix((5, 5))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-22T01:33:20.131868Z",
     "start_time": "2018-11-22T01:33:20.121755Z"
    }
   },
   "outputs": [],
   "source": [
    "def stack_uneven(arrays, fill_value=0.):\n",
    "    '''\n",
    "    Fits arrays into a single numpy array, even if they are\n",
    "    different sizes. `fill_value` is the default value.\n",
    "\n",
    "    Args:\n",
    "            arrays: list of np arrays of various sizes\n",
    "                (must be same rank, but not necessarily same size)\n",
    "            fill_value (float, optional):\n",
    "\n",
    "    Returns:\n",
    "            np.ndarray\n",
    "    '''\n",
    "    sizes = [a.shape for a in arrays]\n",
    "    max_sizes = np.max(list(zip(*sizes)), -1)\n",
    "    # The resultant array has stacked on the first dimension\n",
    "    result = np.zeros((len(arrays),) + tuple(max_sizes))\n",
    "    for i, a in enumerate(arrays):\n",
    "      # The shape of this array `a`, turned into slices\n",
    "      slices = tuple(slice(0,s) for s in sizes[i])\n",
    "      # Overwrite a block slice of `result` with this array `a`\n",
    "      result[i][slices] = a\n",
    "    return result"
   ]
  },
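  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick sanity check of `stack_uneven` (a minimal sketch with made-up shapes): two arrays with mismatched dimensions should land in one `(2, 3, 3)` block, padded with the fill value along the short edges.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "a = np.ones((2, 3))\n",
    "b = np.ones((3, 2))\n",
    "stack_uneven([a, b]).shape  # expected: (2, 3, 3)"
   ]
  },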
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-22T02:08:21.884705Z",
     "start_time": "2018-11-22T02:08:21.847025Z"
    }
   },
   "outputs": [],
   "source": [
    "class DataGenerator(keras.utils.Sequence):\n",
    "    \"\"\"\n",
    "    Generates data with batch size of 1 sample for the purposes of training our model.\n",
    "    \"\"\"\n",
    "    def __init__(self, data, batch_size=32, split=None):\n",
    "        \"\"\"\n",
    "            Sets the initial arguments and creates\n",
    "            an indicies array to randomize the dataset\n",
    "            between epochs\n",
    "        \"\"\"\n",
    "        if split:            \n",
    "            self.indicies = split\n",
    "        else:\n",
    "            self.indicies = list(range(len(data)))\n",
    "        self.data = data\n",
    "        encoder = utils.ClaimEncoder()\n",
    "        self.batch_size = batch_size\n",
    "        _, _, _, _, self.claim_to_article = utils.extract_fever_jsonl_data(\"../train.jsonl\")\n",
    "        \n",
    "    def __len__(self):\n",
    "        return len(self.data)\n",
    "    \n",
    "    def __getitem__(self, index):\n",
    "        return self.get_item(index)\n",
    "    \n",
    "    def get_item(self, index):            \n",
    "        d = self.data[index]\n",
    "        claim = sparse.vstack(encoder.tokenize_claim(d['claim'])).toarray()\n",
    "        evidences = []\n",
    "        ys = []\n",
    "        for e in d['evidence']:\n",
    "            processed = utils.preprocess_article_name(e.split(\"http://wikipedia.org/wiki/\")[1])\n",
    "            tokenized = encoder.tokenize_claim(processed)\n",
    "            dense_arrs = [i.toarray() for i in tokenized]\n",
    "            stacked_arr = np.vstack(dense_arrs)\n",
    "            #stacked_arr = stacked_arr.toarray()\n",
    "            evidences.append(stacked_arr)\n",
    "            if processed in self.claim_to_article[d['claim']]:\n",
    "                ys.append(1)\n",
    "            else:\n",
    "                ys.append(0)\n",
    "\n",
    "        evidences = stack_uneven(evidences)\n",
    "        claim = claim[np.newaxis, :, :]\n",
    "        claim = np.broadcast_to(claim, (len(evidences), claim.shape[1], claim.shape[2]))\n",
    "        return {\"claim\": claim, \"document\":evidences}, np.array(ys)\n",
    "    \n",
    "    def on_epoch_end(self):\n",
    "        #np.random.shuffle(self.indicies)\n",
    "        pass"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-22T02:08:26.232741Z",
     "start_time": "2018-11-22T02:08:22.881283Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Num Distinct Claims 109810\n",
      "Num Data Points 125050\n"
     ]
    }
   ],
   "source": [
    "gen = DataGenerator(train)"
   ]
  },
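  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a sanity check (the exact shapes depend on the vocabulary size, so the numbers will vary), pull one sample and confirm the claim matrix's first dimension matches the number of candidate documents and labels.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "x, y = gen[0]\n",
    "x[\"claim\"].shape, x[\"document\"].shape, y.shape"
   ]
  },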
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-22T01:26:29.102210Z",
     "start_time": "2018-11-22T01:26:29.079139Z"
    }
   },
   "outputs": [],
   "source": [
    "%load_ext line_profiler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-22T01:42:34.328282Z",
     "start_time": "2018-11-22T01:42:33.219694Z"
    }
   },
   "outputs": [],
   "source": [
    "%lprun -f utils.ClaimEncoder.tokenize_claim gen.get_item(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-22T01:44:16.055687Z",
     "start_time": "2018-11-22T01:44:16.044760Z"
    }
   },
   "outputs": [],
   "source": [
    "%run deep_semantic_similarity_keras.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-22T02:08:26.621941Z",
     "start_time": "2018-11-22T02:08:26.234737Z"
    }
   },
   "outputs": [],
   "source": [
    "model = create_model()"
   ]
  },
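  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Optionally inspect the loaded architecture; `summary()` lists the layers and parameter counts of the two-input (claim/document) model.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.summary()"
   ]
  },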
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-18T22:40:51.729499Z",
     "start_time": "2018-11-18T22:40:51.722681Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(400, 24, 29243)"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "c[0]['document'].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-18T17:11:16.647705Z",
     "start_time": "2018-11-18T17:11:16.640591Z"
    }
   },
   "outputs": [],
   "source": [
    "c[1].sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-17T23:17:04.837402Z",
     "start_time": "2018-11-17T23:17:04.833954Z"
    }
   },
   "outputs": [],
   "source": [
    "claim_to_article[train[4]['claim']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-22T01:45:16.611286Z",
     "start_time": "2018-11-22T01:45:15.626240Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import gc\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-22T02:11:47.544291Z",
     "start_time": "2018-11-22T02:08:30.338887Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/1\n",
      "    12/125051 [..............................] - ETA: 424:06:39 - loss: 15.8826"
     ]
    },
    {
     "ename": "ResourceExhaustedError",
     "evalue": "OOM when allocating tensor with shape[400,29243,1,51] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc\n\t [[{{node training_1/Adam/gradients/conv1d_4/convolution/Conv2D_grad/Conv2DBackpropFilter-0-TransposeNHWCToNCHW-LayoutOptimizer}} = Transpose[T=DT_FLOAT, Tperm=DT_INT32, _device=\"/job:localhost/replica:0/task:0/device:GPU:0\"](conv1d_4/convolution/ExpandDims, PermConstNHWCToNCHW-LayoutOptimizer)]]\nHint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.\n\n\t [[{{node loss_1/mul/_271}} = _Recv[client_terminated=false, recv_device=\"/job:localhost/replica:0/task:0/device:CPU:0\", send_device=\"/job:localhost/replica:0/task:0/device:GPU:0\", send_device_incarnation=1, tensor_name=\"edge_1021_loss_1/mul\", tensor_type=DT_FLOAT, _device=\"/job:localhost/replica:0/task:0/device:CPU:0\"]()]]\nHint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.\n",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "\u001b[0;32m~/.local/lib/python3.5/site-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1398\u001b[0m               \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_session\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_session\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_handle\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstatus\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1399\u001b[0;31m               run_metadata_ptr)\n\u001b[0m\u001b[1;32m   1400\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mrun_metadata\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m: ",
      "\nDuring handling of the above exception, another exception occurred:\n",
      "\u001b[0;31mResourceExhaustedError\u001b[0m                    Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-80-9e0b11c82309>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit_generator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgen\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mworkers\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmax_queue_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_multiprocessing\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m/usr/local/lib/python3.5/dist-packages/keras/legacy/interfaces.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m     89\u001b[0m                 warnings.warn('Update your `' + object_name + '` call to the ' +\n\u001b[1;32m     90\u001b[0m                               'Keras 2 API: ' + signature, stacklevel=2)\n\u001b[0;32m---> 91\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     92\u001b[0m         \u001b[0mwrapper\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_original_function\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     93\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/usr/local/lib/python3.5/dist-packages/keras/engine/training.py\u001b[0m in \u001b[0;36mfit_generator\u001b[0;34m(self, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)\u001b[0m\n\u001b[1;32m   1416\u001b[0m             \u001b[0muse_multiprocessing\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0muse_multiprocessing\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1417\u001b[0m             \u001b[0mshuffle\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mshuffle\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1418\u001b[0;31m             initial_epoch=initial_epoch)\n\u001b[0m\u001b[1;32m   1419\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1420\u001b[0m     \u001b[0;34m@\u001b[0m\u001b[0minterfaces\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlegacy_generator_methods_support\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/usr/local/lib/python3.5/dist-packages/keras/engine/training_generator.py\u001b[0m in \u001b[0;36mfit_generator\u001b[0;34m(model, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)\u001b[0m\n\u001b[1;32m    215\u001b[0m                 outs = model.train_on_batch(x, y,\n\u001b[1;32m    216\u001b[0m                                             \u001b[0msample_weight\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msample_weight\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 217\u001b[0;31m                                             class_weight=class_weight)\n\u001b[0m\u001b[1;32m    218\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    219\u001b[0m                 \u001b[0mouts\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mto_list\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mouts\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/usr/local/lib/python3.5/dist-packages/keras/engine/training.py\u001b[0m in \u001b[0;36mtrain_on_batch\u001b[0;34m(self, x, y, sample_weight, class_weight)\u001b[0m\n\u001b[1;32m   1215\u001b[0m             \u001b[0mins\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mx\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0msample_weights\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1216\u001b[0m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_make_train_function\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1217\u001b[0;31m         \u001b[0moutputs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain_function\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mins\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1218\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0munpack_singleton\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moutputs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1219\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/usr/local/lib/python3.5/dist-packages/keras/backend/tensorflow_backend.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, inputs)\u001b[0m\n\u001b[1;32m   2713\u001b[0m                 \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_legacy_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minputs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2714\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2715\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minputs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   2716\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2717\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mpy_any\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mis_tensor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/usr/local/lib/python3.5/dist-packages/keras/backend/tensorflow_backend.py\u001b[0m in \u001b[0;36m_call\u001b[0;34m(self, inputs)\u001b[0m\n\u001b[1;32m   2673\u001b[0m             \u001b[0mfetched\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_callable_fn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0marray_vals\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrun_metadata\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun_metadata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2674\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2675\u001b[0;31m             \u001b[0mfetched\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_callable_fn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0marray_vals\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   2676\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mfetched\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moutputs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2677\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/.local/lib/python3.5/site-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1397\u001b[0m           ret = tf_session.TF_SessionRunCallable(\n\u001b[1;32m   1398\u001b[0m               \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_session\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_session\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_handle\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstatus\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1399\u001b[0;31m               run_metadata_ptr)\n\u001b[0m\u001b[1;32m   1400\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mrun_metadata\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1401\u001b[0m           \u001b[0mproto_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtf_session\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTF_GetBuffer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrun_metadata_ptr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/.local/lib/python3.5/site-packages/tensorflow/python/framework/errors_impl.py\u001b[0m in \u001b[0;36m__exit__\u001b[0;34m(self, type_arg, value_arg, traceback_arg)\u001b[0m\n\u001b[1;32m    524\u001b[0m             \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    525\u001b[0m             \u001b[0mcompat\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mas_text\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mc_api\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTF_Message\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstatus\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstatus\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 526\u001b[0;31m             c_api.TF_GetCode(self.status.status))\n\u001b[0m\u001b[1;32m    527\u001b[0m     \u001b[0;31m# Delete the underlying status object from memory otherwise it stays alive\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    528\u001b[0m     \u001b[0;31m# as there is a reference to status from this from the traceback due to\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mResourceExhaustedError\u001b[0m: OOM when allocating tensor with shape[400,29243,1,51] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc\n\t [[{{node training_1/Adam/gradients/conv1d_4/convolution/Conv2D_grad/Conv2DBackpropFilter-0-TransposeNHWCToNCHW-LayoutOptimizer}} = Transpose[T=DT_FLOAT, Tperm=DT_INT32, _device=\"/job:localhost/replica:0/task:0/device:GPU:0\"](conv1d_4/convolution/ExpandDims, PermConstNHWCToNCHW-LayoutOptimizer)]]\nHint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.\n\n\t [[{{node loss_1/mul/_271}} = _Recv[client_terminated=false, recv_device=\"/job:localhost/replica:0/task:0/device:CPU:0\", send_device=\"/job:localhost/replica:0/task:0/device:GPU:0\", send_device_incarnation=1, tensor_name=\"edge_1021_loss_1/mul\", tensor_type=DT_FLOAT, _device=\"/job:localhost/replica:0/task:0/device:CPU:0\"]()]]\nHint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.\n"
     ]
    }
   ],
   "source": [
    "model.fit_generator(gen, workers=2, max_queue_size=1, use_multiprocessing=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-18T01:49:13.072876Z",
     "start_time": "2018-11-18T01:47:10.425773Z"
    }
   },
   "outputs": [],
   "source": [
    "model.fit_generator(gen, workers=1, max_queue_size=1, pickle_safe=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-14T14:46:46.927557Z",
     "start_time": "2018-11-14T14:40:49.109026Z"
    }
   },
   "outputs": [],
   "source": [
    "pickles = []\n",
    "for file in tqdm_notebook(glob.glob(\"*.pkl\")):\n",
    "    pickles.append(joblib.load(file))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-14T14:46:52.404207Z",
     "start_time": "2018-11-14T14:46:52.296107Z"
    }
   },
   "outputs": [],
   "source": [
    "pickles[0].keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-12T05:32:47.536631Z",
     "start_time": "2018-11-12T05:32:36.946711Z"
    }
   },
   "outputs": [],
   "source": [
    "with open(\"train.pkl\", \"rb\") as f:\n",
    "    train = pickle.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-12T05:32:47.594039Z",
     "start_time": "2018-11-12T05:32:47.538450Z"
    }
   },
   "outputs": [],
   "source": [
    "encoder = utils.ClaimEncoder()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-12T03:51:53.590323Z",
     "start_time": "2018-11-12T03:51:51.274685Z"
    }
   },
   "outputs": [],
   "source": [
    "claims = [utils.preprocess_article_name(i['claim']) for i in train]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-12T04:03:51.270008Z",
     "start_time": "2018-11-12T03:57:49.035673Z"
    }
   },
   "outputs": [],
   "source": [
    "claims_dict = {}\n",
    "for c in tqdm_notebook(claims):\n",
    "    claims_dict[c] = encoder.tokenize_claim(c)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-14T13:18:09.400642Z",
     "start_time": "2018-11-14T13:18:08.232253Z"
    }
   },
   "outputs": [],
   "source": [
    "with open(\"claims_dict.pkl\", \"rb\") as f:\n",
    "    claims_dict = pickle.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-14T13:34:39.618328Z",
     "start_time": "2018-11-14T13:34:39.596966Z"
    }
   },
   "outputs": [],
   "source": [
    "p0[list(claims_dict.keys())[0]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-14T13:18:24.391489Z",
     "start_time": "2018-11-14T13:18:24.314003Z"
    }
   },
   "outputs": [],
   "source": [
    "claims_dict.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-12T04:10:21.776785Z",
     "start_time": "2018-11-12T04:09:28.543161Z"
    }
   },
   "outputs": [],
   "source": [
    "article_names = set()\n",
    "for t in tqdm_notebook(train):\n",
    "    for e in t['evidence']:\n",
    "        name = utils.preprocess_article_name(e)\n",
    "        article_names.add(name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-12T05:34:02.715855Z",
     "start_time": "2018-11-12T05:34:02.710318Z"
    }
   },
   "outputs": [],
   "source": [
    "def process(t):\n",
    "    return [utils.preprocess_article_name(i.split(\"http://wikipedia.org/wiki/\")[1]) for i in t['evidence']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-12T04:29:10.876292Z",
     "start_time": "2018-11-12T04:29:10.854145Z"
    }
   },
   "outputs": [],
   "source": [
    "process(train[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-12T05:36:05.578174Z",
     "start_time": "2018-11-12T05:34:03.797940Z"
    }
   },
   "outputs": [],
   "source": [
    "names = utils.parallel_process(train, process, n_jobs=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-12T05:36:16.586011Z",
     "start_time": "2018-11-12T05:36:05.580307Z"
    }
   },
   "outputs": [],
   "source": [
    "names = list(set(itertools.chain.from_iterable(names)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-12T05:36:16.607346Z",
     "start_time": "2018-11-12T05:36:16.587490Z"
    }
   },
   "outputs": [],
   "source": [
    "len(names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-12T05:36:16.617966Z",
     "start_time": "2018-11-12T05:36:16.609642Z"
    }
   },
   "outputs": [],
   "source": [
    "names[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-12T05:36:16.860975Z",
     "start_time": "2018-11-12T05:36:16.619360Z"
    }
   },
   "outputs": [],
   "source": [
    "names = names[1:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-12T05:36:34.363197Z",
     "start_time": "2018-11-12T05:36:34.352242Z"
    }
   },
   "outputs": [],
   "source": [
    "%load_ext line_profiler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-12T05:36:28.961136Z",
     "start_time": "2018-11-12T05:36:28.954397Z"
    }
   },
   "outputs": [],
   "source": [
    "names[5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-12T05:37:17.031886Z",
     "start_time": "2018-11-12T05:37:17.017109Z"
    }
   },
   "outputs": [],
   "source": [
    "%lprun -f encoder.tokenize_claim encoder.tokenize_claim(names[8])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-12T05:43:04.520738Z",
     "start_time": "2018-11-12T05:37:40.774078Z"
    }
   },
   "outputs": [],
   "source": [
    "article_dict = {}\n",
    "for name in tqdm_notebook(names):\n",
    "    article_dict[name] = encoder.tokenize_claim(name)\n",
    "return article_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-12T04:52:29.382634Z",
     "start_time": "2018-11-12T04:52:29.376726Z"
    }
   },
   "outputs": [],
   "source": [
    "def transform(n):\n",
    "    article_dict = {}\n",
    "    for name in n:\n",
    "        article_dict[name] = encoder.tokenize_claim(name)\n",
    "    return article_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-12T04:43:01.128405Z",
     "start_time": "2018-11-12T04:43:00.935684Z"
    }
   },
   "outputs": [],
   "source": [
    "from joblib import Parallel, delayed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-12T04:52:31.252957Z",
     "start_time": "2018-11-12T04:52:31.247201Z"
    }
   },
   "outputs": [],
   "source": [
    "def chunks(l, n):\n",
    "    \"\"\"Yield successive n-sized chunks from l.\"\"\"\n",
    "    for i in range(0, len(l), n):\n",
    "        yield l[i:i + n]"
   ]
  },
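  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A small usage example of `chunks` (illustrative values only): the final chunk may be shorter than `n`.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "list(chunks(list(range(7)), 3))  # [[0, 1, 2], [3, 4, 5], [6]]"
   ]
  },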
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-12T04:54:14.744014Z",
     "start_time": "2018-11-12T04:54:14.737052Z"
    }
   },
   "outputs": [],
   "source": [
    "len(names)//10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-12T05:10:30.110902Z",
     "start_time": "2018-11-12T05:05:08.569728Z"
    }
   },
   "outputs": [],
   "source": [
    "processed = Parallel(n_jobs=10, verbose=1, prefer=\"threads\")(delayed(transform)(n) for n in chunks(names, len(names)//100000))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-12T04:42:12.991673Z",
     "start_time": "2018-11-12T04:40:19.343920Z"
    }
   },
   "outputs": [],
   "source": [
    "transformed = utils.parallel_process(names, encoder.tokenize_claim, n_jobs=10)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
